import numpy as np
import pandas as pd
from itertools import cycle
from scipy import interp
# sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
# TensorFlow
import tensorflow as tf
# Timer
from timeit import default_timer as timer
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
from matplotlib import cm
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we demonstrate solving a classification problem in TensorFlow using Estimators using the Heart Disease Dataset from the UCI Machine Learning Repository.

Picture Source: harvard.edu
The object of the exercise is to develop a predictive model that can predict whether heart disease is present or absent based on the rest of the given features.
Data = np.genfromtxt('heart-disease/heart.dat', delimiter=' ')
Attributes = ['Age', 'Sex', 'Chest Pain Type', 'Resting Blood Pressure', 'Serum Cholestoral',
'Fasting Blood Sugar', 'Resting Electrocardiographic Results', 'Maximum Heart Rate Achieved',
'Exercise Induced Angina', 'Oldpeak', 'Slope',
'Number of Major Vessels', 'Thal', 'Heart Disease']
Data = pd.DataFrame(data = Data, columns = Attributes)
#
Temp = ['Sex', 'Chest Pain Type', 'Fasting Blood Sugar', 'Resting Electrocardiographic Results',
'Exercise Induced Angina', 'Slope', 'Number of Major Vessels','Thal']
for c in Temp:
Data[c] = Data[c].astype(int).astype(str)
del Temp, c
Target = 'Heart Disease'
Labels_dict = dict(zip([0,1],['Absent', 'Present']))
Data['Heart Disease'] = (Data['Heart Disease']-1).astype(int)
#
display(Data.head(5))
display(pd.DataFrame({'Number of Instances': [Data.shape[0]], 'Number of Attributes': [Data.shape[1]]}).style.hide_index())
# Maps
Maps = {'Sex': {'0':'Female', '1':'Male'},
'Chest Pain Type': {'1':'Typical Angina', '2':'Atypical Angina', '3': 'Non-Anginal Pain', '4':'Asymptomatic'},
'Fasting Blood Sugar': {'0': 'False', '1': 'True'}, 'Exercise Induced Angina': {'0': 'No', '1': 'Yes'},
'Slope': {'1': 'Upsloping', '2': 'Flat', '3': 'Downsloping'},
'Thal': {'3': 'Normal', '6': 'Fixed Defect','7': 'Reversable Defect'}}
| Age | Sex | Chest Pain Type | Resting Blood Pressure | Serum Cholestoral | Fasting Blood Sugar | Resting Electrocardiographic Results | Maximum Heart Rate Achieved | Exercise Induced Angina | Oldpeak | Slope | Number of Major Vessels | Thal | Heart Disease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 70.0 | 1 | 4 | 130.0 | 322.0 | 0 | 2 | 109.0 | 0 | 2.4 | 2 | 3 | 3 | 1 |
| 1 | 67.0 | 0 | 3 | 115.0 | 564.0 | 0 | 2 | 160.0 | 0 | 1.6 | 2 | 0 | 7 | 0 |
| 2 | 57.0 | 1 | 2 | 124.0 | 261.0 | 0 | 0 | 141.0 | 0 | 0.3 | 1 | 0 | 7 | 1 |
| 3 | 64.0 | 1 | 4 | 128.0 | 263.0 | 0 | 0 | 105.0 | 1 | 0.2 | 2 | 1 | 7 | 0 |
| 4 | 74.0 | 0 | 2 | 120.0 | 269.0 | 0 | 2 | 121.0 | 1 | 0.2 | 1 | 1 | 3 | 0 |
| Number of Instances | Number of Attributes |
|---|---|
| 270 | 14 |
def Data_Plot(Inp, Title = None, W = None):
data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
Temp = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
data_info = data_info.join(Temp, how='outer')
data_info ['Size'] = Inp.shape[0]
data_info['Percentage'] = 100 - np.round(100*(data_info['Number of NaN Values']/Inp.shape[0]),2)
data_info = data_info.reset_index(drop = False).rename(columns = {'index':'Features'})
#
fig = px.bar(data_info, x= 'Features', y= 'Percentage', color = 'Data Type',
text = 'Percentage',
color_discrete_sequence = ['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
hover_data = data_info.columns)
fig.update_layout(plot_bgcolor= 'white', legend=dict(x=1.01, y=.5, traceorder="normal",
bordercolor="DarkGray", borderwidth=1))
if not W == None:
fig.update_layout(width = W)
fig.update_traces(texttemplate= 10*' ' + '%%{text}', textposition='inside')
fig.update_traces(marker_line_color= 'Black', marker_line_width=1., opacity=1)
if not Title == None:
fig.update_layout(title={'text': '<b>' + Title + '<b>', 'x':0.5,
'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
return data_info
data_info = Data_Plot(Data, Title = 'Heart Disease Dataset', W = 800)
# A copy of the Dataframe
df = Data.copy()
# for TF
df.columns = [x.replace(' ','_') for x in df.columns]
Temp = Target.replace(' ','_')
X = df.drop(columns = Temp)
y = df[Temp].values
del df, Temp
def DatasetTargetDist(Inp, Target, Labels_dict, PD):
# Table
Table = Inp[Target].value_counts().to_frame('Count').reset_index(drop = False).rename(columns = {'index':Target})
Table[Target] = Table[Target].replace(Labels_dict)
Table['Percentage'] = np.round(100*(Table['Count']/Table['Count'].sum()),2)
fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=PD['column_widths'],
specs=[[{"type": "table"},{"type": "pie"}]])
# Right
fig.add_trace(go.Pie(labels=Table[Target].values, values=Table['Count'].values,
pull=PD['pull'], textfont=dict(size= PD['textfont']),
marker=dict(colors = PD['PieColors'], line=dict(color='black', width=1))), row=1, col=2)
fig.update_traces(hole=PD['hole'])
fig.update_layout(height = PD['height'], legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
# Left
T = Table.copy()
T['Percentage'] = T['Percentage'].map(lambda x: '%%%.2f' % x)
Temp = []
for i in T.columns:
Temp.append(T.loc[:,i].values)
fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
fill_color= PD['TableColors'][0], align=['center','center'],
font=dict(color='white', size=12), height=25), columnwidth = PD['tablecolumnwidth'],
cells=dict(values=Temp, line_color='darkslategray',
fill=dict(color= [PD['TableColors'][1], PD['TableColors'][1]]),
align=['center', 'center'], font_size=12, height=20)), 1, 1)
fig.update_layout(title={'text': '<b>' + Target + '<b>', 'x':PD['title_x'],
'y':PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Pull = [0 for x in range((len(Labels_dict)-1))]
Pull.append(.05)
PD = dict(PieColors = ['SeaGreen','FireBrick'],
TableColors = ['Navy','White'], hole = .4,
column_widths=[0.6, 0.4],textfont = 14, height = 350, tablecolumnwidth = [0.20, 0.12, 0.15],
pull = Pull, legend_title = Target, title_x = 0.5, title_y = 0.8)
del Pull
DatasetTargetDist(Data, Target, Labels_dict, PD)
StratifiedKFold is a variation of k-fold which returns stratified folds: each set contains approximately the same percentage of samples of each target class as the complete set.
Test_Size = 0.3
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
_ = sss.get_n_splits(X, y)
# For Tensorflow
X.columns = [x.replace(' ','_') for x in X.columns]
for train_index, test_index in sss.split(X, y):
# X
if isinstance(X, pd.DataFrame):
X_train, X_test = X.loc[train_index], X.loc[test_index]
else:
X_train, X_test = X[train_index], X[test_index]
# y
if isinstance(y, pd.Series):
y_train, y_test = y[train_index], y[test_index]
else:
y_train, y_test = y[train_index], y[test_index]
del sss
def Train_Test_Dist(X_train, y_train, X_test, y_test, PD, Labels_dict = Labels_dict):
def ToSeries(x):
if not isinstance(x, pd.Series):
Out = pd.Series(x)
else:
Out = x.copy()
return Out
fig = make_subplots(rows=1, cols=3, horizontal_spacing = 0.02, column_widths= PD['column_widths'],
specs=[[{"type": "table"},{'type':'domain'}, {'type':'domain'}]])
# Right
C = 2
for y in [ToSeries(y_train).replace(Labels_dict), ToSeries(y_test).replace(Labels_dict)]:
fig.add_trace(go.Pie(labels= list(Labels_dict.values()),
values= y.value_counts().values, pull=PD['pull'],
textfont=dict(size=PD['textfont']),
marker=dict(colors = PD['PieColors'],
line=dict(color='black', width=1))), row=1, col=C)
fig.update_traces(hole=.5)
fig.update_layout(legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
C+=1
# Left
# Table
Table = pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).astype(str)
T = Table.copy()
Temp = []
for i in T.columns:
Temp.append(T.loc[:,i].values)
TableColors = PD['TableColors']
fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
fill_color= TableColors[0], align=['center','center'],
font=dict(color='white', size=12), height=25), columnwidth = PD['tablecolumnwidth'],
cells=dict(values=Temp, line_color='darkslategray',
fill=dict(color= [TableColors[1], TableColors[1]]),
align=['center', 'center'], font_size=12, height=20)), 1, 1)
fig.update_layout(title={'text': '<b>' + 'Dataset Distribution' + '<b>', 'x':PD['title_x'],
'y':PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
if not PD['height'] == None:
fig.update_layout(height = PD['height'])
fig.show()
PD.update(dict(column_widths=[0.3, 0.3, 0.3], tablecolumnwidth = [0.2, 0.4], height = 350, legend_title = Target))
Train_Test_Dist(X_train, y_train, X_test, y_test, PD)
Here, we use the Tensorflow Linear classifier model.tf.estimator.LinearClassifier.
The input function specifies how data is converted to a tf.data.Dataset that feeds the input pipeline in a streaming fashion. Moreover, an input function is a function that returns a tf.data.Dataset object which outputs the following two-element tuple:
def input_fn(features, labels, training=True, batch_size=256):
"""An input function for training or evaluating"""
# Convert the inputs to a Dataset.
dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
# Shuffle and repeat if you are in training mode.
if training:
dataset = dataset.shuffle(1000).repeat()
return dataset.batch(batch_size)
Moreover, an estimator model consists of two main parts, feature columns, and a numeric vector. Feature columns provide explanations for the input numeric vector. The following function separates categorical and numerical columns (features)and returns a descriptive list of feature columns.
def Feat_Columns(Inp):
Temp = Inp.dtypes.reset_index(drop = False)
Temp.columns = ['Features', 'Data Type']
Temp['Data Type'] = Temp['Data Type'].astype(str)
# Numeric_Columns
Numeric_Columns = Temp.loc[Temp['Data Type'].isin(['int64', 'int32', 'float64', 'float32']),'Features'].tolist()
# Categorical_Columns
Categorical_Columns = Temp.loc[Temp['Data Type'] == 'object','Features'].tolist()
# Feature Columns
feature_columns = []
if len(Categorical_Columns)>0:
for feature_name in Categorical_Columns:
vocabulary = Inp[feature_name].unique()
feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary))
if len(Numeric_Columns)>0:
for feature_name in Numeric_Columns:
feature_columns.append(tf.feature_column.numeric_column(feature_name))
return feature_columns
my_feature_columns = Feat_Columns(X)
tf.keras.backend.clear_session()
IT = int(5e3)
classifier = tf.estimator.LinearClassifier(feature_columns=my_feature_columns)
classifier.train(input_fn=lambda: input_fn(X_train, y_train, training=True), max_steps = IT)
result = classifier.evaluate(input_fn=lambda: input_fn(X_test, y_test, training=False))
clear_output()
display(pd.DataFrame(result, index = ['']).round(4))
| accuracy | accuracy_baseline | auc | auc_precision_recall | average_loss | label/mean | loss | precision | prediction/mean | recall | global_step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.8765 | 0.5556 | 0.9148 | 0.846 | 0.4334 | 0.4444 | 0.4334 | 0.825 | 0.5008 | 0.9167 | 5000 |
def ROC_Curve(y_test, probs, n_classes, FS = 7, ax = False, pad = 0.01):
# converting y_test to categorical
y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes=n_classes, dtype='float32')
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = metrics.roc_curve(y_test_cat[:, i], probs[:, i])
roc_auc[i] = metrics.auc(fpr[i], tpr[i])
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = metrics.roc_curve(y_test_cat.ravel(), probs.ravel())
roc_auc["micro"] = metrics.auc(fpr["micro"], tpr["micro"])
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
mean_tpr += interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = metrics.auc(fpr["macro"], tpr["macro"])
fig = go.Figure()
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], name = 'FPR = TPR', line = dict(color='Black', width=2, dash='dash')))
fig.add_trace(go.Scatter(x=fpr["micro"], y=tpr["micro"], mode='lines', marker_color = 'deeppink',
name='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"])))
fig.add_trace(go.Scatter(x=fpr["macro"], y=tpr["macro"], mode='lines', marker_color = 'navy',
name='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"])))
colors = cycle(['Aqua', 'DarkOrange', 'CornflowerBlue'])
for i, color in zip(range(n_classes), colors):
_ = fig.add_trace(go.Scatter(x = fpr[i], y = tpr[i], mode='lines', marker_color= px.colors.sequential.Rainbow,
name='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])))
# Background
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
zeroline=True, zerolinewidth=1, zerolinecolor='Black',
showgrid=True, gridwidth=1, gridcolor='Lightgray', range =[-pad, 1+pad],
title = 'False Positive Rate (FPR)')
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
zeroline=True, zerolinewidth=1, zerolinecolor='Black',
showgrid=True, gridwidth=1, gridcolor='Lightgray', range =[-pad, 1+pad],
title = 'True Positive Rate (TPR)')
fig.update_yaxes(scaleanchor = "x", scaleratio = 1)
fig.update_layout(height = 600, width = 810)
fig.update_layout(title={'text': '<b>' + 'Receiver Operating Characteristic (ROC) Curves' + '<b>', 'x': .5,
'y': .9, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
pred_dicts = list(classifier.predict(input_fn=lambda: input_fn(X_test, y_test, training=False)))
clear_output()
probs = np.array([pred['probabilities'] for pred in pred_dicts])
ROC_Curve(y_test, probs, n_classes = len(Labels_dict), FS = 8)
The confusion matrix allows for visualization of the performance of an algorithm. Note that due to the size of data, here we don't provide a Cross-validation evaluation. In general, this type of evaluation is preferred.
def Confusion_Mat(CM_Train, CM_Test, PD, n_splits = 10):
if n_splits == None:
Titles = ['Train Set', 'Test Set']
else:
Titles = ['Train Set (CV = % i)' % n_splits, 'Test Set (CV = % i)' % n_splits]
CM = [CM_Train, CM_Test]
Cmap = ['Greens', 'YlGn','Blues', 'PuBu']
for i in range(2):
fig, ax = plt.subplots(1, 2, figsize= PD['FS'])
fig.suptitle(Titles[i], weight = 'bold', fontsize = 16)
_ = sns.heatmap(CM[i], annot=True, annot_kws={"size": PD['annot_kws']}, cmap=Cmap[2*i], ax = ax[0],
linewidths = 0.2, cbar_kws={"shrink": PD['shrink']})
_ = ax[0].set_title('Confusion Matrix');
Temp = np.round(CM[i].astype('float') / CM[i].sum(axis=1)[:, np.newaxis], 2)
_ = sns.heatmap(Temp,
annot=True, annot_kws={"size": PD['annot_kws']}, cmap=Cmap[2*i+1], ax = ax[1],
linewidths = 0.4, vmin=0, vmax=1, cbar_kws={"shrink": PD['shrink']})
_ = ax[1].set_title('Normalized Confusion Matrix');
for a in ax:
_ = a.set_xlabel('Predicted labels')
_ = a.set_ylabel('True labels');
_ = a.xaxis.set_ticklabels(PD['Labels'])
_ = a.yaxis.set_ticklabels(PD['Labels'])
_ = a.set_aspect(1)
# Train
pred_dicts = list(classifier.predict(input_fn=lambda: input_fn(X_train, y_train, training=False)))
clear_output()
probs = np.array([pred['probabilities'] for pred in pred_dicts])
y_pred = np.argmax(probs, axis = 1).reshape(-1,1)
Reports_Train = pd.DataFrame(metrics.classification_report(y_train, y_pred, target_names=list(Labels_dict.values()),
output_dict=True)).T
CM_Train = metrics.confusion_matrix(y_train, y_pred)
# Test
pred_dicts = list(classifier.predict(input_fn=lambda: input_fn(X_test, y_test, training=False)))
clear_output()
probs = np.array([pred['probabilities'] for pred in pred_dicts])
y_pred = np.argmax(probs, axis = 1).reshape(-1,1)
Reports_Test = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=list(Labels_dict.values()),
output_dict=True)).T
CM_Test = metrics.confusion_matrix(y_test, y_pred)
Reports_Train = Reports_Train.reset_index().rename(columns ={'index': 'Train Set'})
Reports_Test = Reports_Test.reset_index().rename(columns ={'index': 'Test Set'})
display(Reports_Train.style.hide_index().set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
set_properties(subset=['Train Set'], **{'background-color': 'SeaGreen', 'color': 'White'}))
display(Reports_Test.style.hide_index().set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
set_properties(subset=['Test Set'], **{'background-color': 'RoyalBlue', 'color': 'White'}))
PD = dict(FS = (10, 6), annot_kws = 14, shrink = .6, Labels = list(Labels_dict.values()))
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = None)
| Train Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Absent | 0.858407 | 0.923810 | 0.889908 | 105.000000 |
| Present | 0.894737 | 0.809524 | 0.850000 | 84.000000 |
| accuracy | 0.873016 | 0.873016 | 0.873016 | 0.873016 |
| macro avg | 0.876572 | 0.866667 | 0.869954 | 189.000000 |
| weighted avg | 0.874554 | 0.873016 | 0.872171 | 189.000000 |
| Test Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Absent | 0.926829 | 0.844444 | 0.883721 | 45.000000 |
| Present | 0.825000 | 0.916667 | 0.868421 | 36.000000 |
| accuracy | 0.876543 | 0.876543 | 0.876543 | 0.876543 |
| macro avg | 0.875915 | 0.880556 | 0.876071 | 81.000000 |
| weighted avg | 0.881572 | 0.876543 | 0.876921 | 81.000000 |
The Follow the Regularized Leader (FTRL) model is an implementation of the FTRL-Proximal online learning algorithm for binomial logistic regression (for details see [6]).
tf.keras.backend.clear_session()
IT = int(5e3)
classifier = tf.estimator.LinearClassifier(feature_columns=my_feature_columns,
optimizer=tf.keras.optimizers.Ftrl(learning_rate=0.1, l1_regularization_strength=0.001))
#
classifier.train(input_fn=lambda: input_fn(X_train, y_train, training=True), max_steps = IT)
result = classifier.evaluate(input_fn=lambda: input_fn(X_test, y_test, training=False))
clear_output()
display(pd.DataFrame(result, index = ['']).round(4))
| accuracy | accuracy_baseline | auc | auc_precision_recall | average_loss | label/mean | loss | precision | prediction/mean | recall | global_step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.8765 | 0.5556 | 0.9247 | 0.8833 | 0.4117 | 0.4444 | 0.4117 | 0.8095 | 0.528 | 0.9444 | 5000 |
pred_dicts = list(classifier.predict(input_fn=lambda: input_fn(X_test, y_test, training=False)))
clear_output()
probs = np.array([pred['probabilities'] for pred in pred_dicts])
ROC_Curve(y_test, probs, n_classes = len(Labels_dict), FS = 8)
# Train
pred_dicts = list(classifier.predict(input_fn=lambda: input_fn(X_train, y_train, training=False)))
clear_output()
probs = np.array([pred['probabilities'] for pred in pred_dicts])
y_pred = np.argmax(probs, axis = 1).reshape(-1,1)
Reports_Train = pd.DataFrame(metrics.classification_report(y_train, y_pred, target_names=list(Labels_dict.values()),
output_dict=True)).T
CM_Train = metrics.confusion_matrix(y_train, y_pred)
# Test
pred_dicts = list(classifier.predict(input_fn=lambda: input_fn(X_test, y_test, training=False)))
clear_output()
probs = np.array([pred['probabilities'] for pred in pred_dicts])
y_pred = np.argmax(probs, axis = 1).reshape(-1,1)
Reports_Test = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=list(Labels_dict.values()),
output_dict=True)).T
CM_Test = metrics.confusion_matrix(y_test, y_pred)
Reports_Train = Reports_Train.reset_index().rename(columns ={'index': 'Train Set'})
Reports_Test = Reports_Test.reset_index().rename(columns ={'index': 'Test Set'})
display(Reports_Train.style.hide_index().set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
set_properties(subset=['Train Set'], **{'background-color': 'SeaGreen', 'color': 'White'}))
display(Reports_Test.style.hide_index().set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
set_properties(subset=['Test Set'], **{'background-color': 'RoyalBlue', 'color': 'White'}))
PD = dict(FS = (10, 6), annot_kws = 14, shrink = .6, Labels = list(Labels_dict.values()))
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = None)
| Train Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Absent | 0.878505 | 0.895238 | 0.886792 | 105.000000 |
| Present | 0.865854 | 0.845238 | 0.855422 | 84.000000 |
| accuracy | 0.873016 | 0.873016 | 0.873016 | 0.873016 |
| macro avg | 0.872179 | 0.870238 | 0.871107 | 189.000000 |
| weighted avg | 0.872882 | 0.873016 | 0.872850 | 189.000000 |
| Test Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Absent | 0.948718 | 0.822222 | 0.880952 | 45.000000 |
| Present | 0.809524 | 0.944444 | 0.871795 | 36.000000 |
| accuracy | 0.876543 | 0.876543 | 0.876543 | 0.876543 |
| macro avg | 0.879121 | 0.883333 | 0.876374 | 81.000000 |
| weighted avg | 0.886854 | 0.876543 | 0.876882 | 81.000000 |
tf.keras.backend.clear_session()
IT = int(5e3)
classifier = tf.estimator.LinearClassifier(feature_columns=my_feature_columns,
optimizer=lambda: tf.keras.optimizers.Adam(learning_rate=tf.compat.v1.train.exponential_decay(learning_rate=0.1,
global_step=tf.compat.v1.train.get_global_step(), decay_steps=IT,decay_rate=0.96)))
#
classifier.train(input_fn=lambda: input_fn(X_train, y_train, training=True), max_steps = IT)
result = classifier.evaluate(input_fn=lambda: input_fn(X_test, y_test, training=False))
clear_output()
display(pd.DataFrame(result, index = ['']).round(4))
| accuracy | accuracy_baseline | auc | auc_precision_recall | average_loss | label/mean | loss | precision | prediction/mean | recall | global_step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.8148 | 0.5556 | 0.8843 | 0.7893 | 0.9979 | 0.4444 | 0.9979 | 0.7234 | 0.595 | 0.9444 | 5000 |
pred_dicts = list(classifier.predict(input_fn=lambda: input_fn(X_test, y_test, training=False)))
clear_output()
probs = np.array([pred['probabilities'] for pred in pred_dicts])
ROC_Curve(y_test, probs, n_classes = len(Labels_dict), FS = 8)
# Train
pred_dicts = list(classifier.predict(input_fn=lambda: input_fn(X_train, y_train, training=False)))
clear_output()
probs = np.array([pred['probabilities'] for pred in pred_dicts])
y_pred = np.argmax(probs, axis = 1).reshape(-1,1)
Reports_Train = pd.DataFrame(metrics.classification_report(y_train, y_pred, target_names=list(Labels_dict.values()),
output_dict=True)).T
CM_Train = metrics.confusion_matrix(y_train, y_pred)
# Test
pred_dicts = list(classifier.predict(input_fn=lambda: input_fn(X_test, y_test, training=False)))
clear_output()
probs = np.array([pred['probabilities'] for pred in pred_dicts])
y_pred = np.argmax(probs, axis = 1).reshape(-1,1)
Reports_Test = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=list(Labels_dict.values()),
output_dict=True)).T
CM_Test = metrics.confusion_matrix(y_test, y_pred)
Reports_Train = Reports_Train.reset_index().rename(columns ={'index': 'Train Set'})
Reports_Test = Reports_Test.reset_index().rename(columns ={'index': 'Test Set'})
display(Reports_Train.style.hide_index().set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
set_properties(subset=['Train Set'], **{'background-color': 'SeaGreen', 'color': 'White'}))
display(Reports_Test.style.hide_index().set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
set_properties(subset=['Test Set'], **{'background-color': 'RoyalBlue', 'color': 'White'}))
PD = dict(FS = (10, 6), annot_kws = 14, shrink = .6, Labels = list(Labels_dict.values()))
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = None)
| Train Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Absent | 0.912088 | 0.790476 | 0.846939 | 105.000000 |
| Present | 0.775510 | 0.904762 | 0.835165 | 84.000000 |
| accuracy | 0.841270 | 0.841270 | 0.841270 | 0.841270 |
| macro avg | 0.843799 | 0.847619 | 0.841052 | 189.000000 |
| weighted avg | 0.851387 | 0.841270 | 0.841706 | 189.000000 |
| Test Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Absent | 0.941176 | 0.711111 | 0.810127 | 45.000000 |
| Present | 0.723404 | 0.944444 | 0.819277 | 36.000000 |
| accuracy | 0.814815 | 0.814815 | 0.814815 | 0.814815 |
| macro avg | 0.832290 | 0.827778 | 0.814702 | 81.000000 |
| weighted avg | 0.844389 | 0.814815 | 0.814193 | 81.000000 |